####################################################################################################
#### Converting WOS data to csv  ###################################################################
####################################################################################################

# Pasting all raw files below ../data/ into one file and dropping all tags which are not needed.
# Only lines starting with one of the listed two-letter tags are kept; grep runs in the C locale
# and --no-filename suppresses the "file:" prefix that a recursive grep would otherwise add.
# The result is written to ../source_files/, which is where every later step reads it from.
LC_ALL=C grep -E --no-filename -R '^(FN|H8|PY|R9|T9|EX|UT|TI|AU|J1|SC|CF|UI|RE)' ../data/ > ../source_files/all_data_clean_tags

# limiting to papers from SSCI (done by parse_ssci.pl, which is not part of this file)
perl parse_ssci.pl ../source_files/all_data_clean_tags > ../processed_files/wos.csv

####################################################################################################
#### Defining waves ################################################################################
####################################################################################################

# starting with file "welle1" (wave1) which contains the UT numbers of all "starting articles"
# using a regular expression we shrink the source file to the starting articles:
# the range operator prints every line from a UT line matching one of the listed identifiers
# through the next line that starts with EX.
# NOTE: the pattern must stay on ONE physical line. A line break inside it becomes a literal
# newline in the regex, and the alternative containing it can then never match an input line.
perl -ne 'print if (/(?^:^UT (?:A19(?:(?:8(?:4(?:S(?:(?:Q6840000|V5390001)2|R9590000[13]|D26800006|M43900004)|T(?:(?:K344|S482)00009|G54400002)|ADM3600006)|2(?:P(?:(?:(?:J4|N8)81|Q958)00001|R9920000[13]|E30900006)|N(?:W7060000[48]|M63200004|T77000005))|0(?:K(?:T(?:64800001|75900010)|A77700006|C19000011|H97400003)|J(?:E70100001|W32900007))|3(?:R(?:(?:P709|U856)00008|R02900001|S58700004)|S(?:M612|Q151)00008|QR63700004)|1(?:L(?:G40900006|P95300009|U03100011)|M(?:A11800008|M67700006|V88200001))|7(?:G(?:61240000[17]|182800001|939100004)|H(?:363700007|545000008))|8(?:P2(?:20200002|62400008)|(?:M339|N660)100007)|9(?:A(?:E13400015|R82100001)|U652600005)|5A(?:(?:WE890000|ZA720001)2|PZ2600005)|6(?:F01530000[57]|D638900005))|7(?:9(?:H(?:Y3(?:3900004|7500001)|C60900002|G42800004|M93700006)|G(?:H664000(?:04|10)|R75700001|W94400017))|1(?:J(?:405200013|552900004|695800001)|Y(?:091700002|257400001)|I770400002|K035700003)|8(?:G(?:B83100012|H42700006|K44300003)|F(?:D91700001|U33500016)|EW1020001[02])|3(?:Q(?:44440000[12]|305200001|662300005)|P(?:356000005|699300001))|6(?:C(?:(?:L8040001|W0580000)6|Q8920000[68]|C75700002)|BS28900041)|7(?:D(?:(?:G4920000|L6000001)2|(?:Q795|Z006)00003)|CV62600008)|4(?:A(?:C20400011|F88600005)|T079000005|U124400009)|0(?:G990800007|Y291400006)|2(?:M981700004|O174300015)|5AC98900003)|6(?:6(?:Z(?:A(?:08600006|16600007)|F78800003)|8499500001)|8Z(?:A(?:17600002|37900003)|E50800001|G70100006)|9(?:E(?:301800005|805700007)|Y446500014)|5C(?:A(?:K9900008|Z7900006)|FU1200004)|2C(?:AL8300004|FW8200001)|(?:4CFJ88|7ZA052)00002|1CAG8400006|3P160400003)|56CAG6200001)$|9(?:(?:2(?:H(?:A59000030|R13400014|Z50300008)|J(?:H62100004|V04500001|X86900010))|1(?:G(?:A36500009|B35900001|W04400003)|F(?:J3640001|N0130000)3)|0(?:D(?:A38900060|D63700001|E67600012|U11800009)|EN42900009)|4(?:P(?:K56200004|T58300001)|NZ3120000[14])|3(?:K(?:M62000002|N56600006)|MP99100007)|7(?:YF7580000[26]|WH37400001)|6VE68000007)$|5(?:(?:R(?:(?:C215|V113)00005|Q72200009)|TD96200001)$|Q(?:L93000006$|D96300001$))))|000(?:0(?:7(?:963(?:980000[346]|6400032)|(?:22271|79532)00002|3493600001|5653500003)|8(?:9(?:200400008|333100004)|0208100008|4915300001))|1(?:8(?:2731600010|3622600003|7174000041|8768500007)|7(?:0829800023|8916200002)|65505200002)|2(?:(?:(?:487747|712363)0000|9063690002)1|07930500002))$))/../^EX/)' ../source_files/all_data_clean_tags > ../w1/welle1_bloecke
# Sanity check: count the T9 tag lines in the wave-1 extract (adjacent duplicates collapsed).
# The extract lives in ../w1/, and the tag is anchored so that a "T9" inside another field is not counted.
grep "^T9" ../w1/welle1_bloecke | uniq | wc -l
# "welle1_bloecke" contains all information on the 197 articles found

# The list of starting articles is now separated into one file per author/nobelist.
# Input lines are expected to look like "<UT>;<digits>;<digits>;<author>"; for each such line
# "UT <UT>" is appended to ../w1/startfiles/<author>. Lines that do not have this shape are
# skipped instead of being written with the values left over from a previous line.
# -f: an already empty directory is not an error
rm -f ../w1/startfiles/*
perl -e '
    use strict; use warnings; use FindBin;
    while (my $line = <>) {
        next unless $line =~ /(.+);\d+;\d+;(\w+)/;
        my ($ut, $author) = ($1, $2);
        my $outfile = "$FindBin::Bin/../w1/startfiles/$author";
        open(my $fh, ">>", $outfile) or die "cannot append to $outfile: $!";
        print {$fh} "UT $ut\n";
        close($fh) or die "cannot close $outfile: $!";
    }
' ../source_files/UT\ Jahr\ Nobelist.csv

# based on the 23 author files we create 23 optimized regexes:
# every file in ../w1/startfiles/ is loaded into its own Regexp::Assemble object and the
# stringified object is written to a file of the same name in ../w1/rxo/.
# Failures to open the directory or an output file now abort with a message instead of
# silently producing nothing.
perl -e '
    use strict; use warnings;
    use FindBin; use Regexp::Assemble;
    my $dir = "$FindBin::Bin/../w1/startfiles";
    opendir(my $dh, $dir) or die "cannot open directory $dir: $!";
    while (my $file = readdir($dh)) {
        next unless $file =~ /[^.]/;    # skips "." and ".."
        my $ra = Regexp::Assemble->new;
        $ra->add_file("$dir/$file");
        my $outfile = "$FindBin::Bin/../w1/rxo/$file";
        open(my $fh, ">", $outfile) or die "cannot write $outfile: $!";
        print {$fh} $ra;
        close($fh) or die "cannot close $outfile: $!";
    }
    closedir($dh);
'

# Parse the wave-1 data into the articles of the 23 authors.
# First an EX at the start of a line is replaced by EX_THE_END: the parser works with the
# $/ operator, which does not understand regexes, i.e. it needs a unique string to separate on.
perl -pi -e 's/^EX/EX_THE_END/g;' ../w1/welle1_bloecke
perl wave_parser.pl ../w1/welle1_bloecke w1

# saving these blocks as starting files:
# for every file in ../w1/blocks/ only its T9 lines are kept and stored under the same
# file name in ../w1/startfiles/
FILES=../w1/blocks/*
for file in $FILES; do
    # if nothing matches, the pattern stays unexpanded; do not create a file named "*"
    [ -e "$file" ] || continue
    NAME=$(basename "$file")
    MYPATH="../w1/startfiles/$NAME"
    perl -ne 'print if /^T9\s+/;' "$file" > "$MYPATH"
done

# fetching R9 tags from w1. These are cited articles.
# The leading R9 tag of each such line is rewritten to "^T9" (a T9 tag with a line-start
# anchor); the resulting file is what the regex assembly for wave 2 reads.
# Only the tag at the start of the line is touched, so an "R9" occurring later in the line
# stays intact, and no -i switch is used because the data is not edited in place.
perl -ne 'print if s/^R9/^T9/' ../w1/welle1_bloecke > ../w2/welle2

# Wave 2 gets the same treatment: all patterns listed in ../w2/welle2 are merged into one
# assembled regex, which is stored in ../w2/welle2.rxo
perl -MRegexp::Assemble -e 'my $assembler = Regexp::Assemble->new; $assembler->add_file("../w2/welle2"); print $assembler->re;' > ../w2/welle2.rxo

# Fetch the subset of the data containing all w2 articles. blockinator.pl is handed the full
# tag file and the assembled wave-2 regex; the elapsed wall-clock time is reported afterwards.
T=$(date +%s)
perl blockinator.pl ../source_files/all_data_clean_tags ../w2/welle2.rxo > ../w2/welle2_bloecke
T=$(( $(date +%s) - T ))
echo "Time in seconds: $T"

# Sanity check: count the T9 tag lines in the wave-2 extract (adjacent duplicates collapsed).
# The extract lives in ../w2/, and the tag is anchored so that a "T9" inside another field is not counted.
grep "^T9" ../w2/welle2_bloecke | uniq | wc -l

# Build the wave-2 start files: for every file in ../w1/blocks/ the R9 lines are kept,
# their leading R9 tag is renamed to T9, and the result is stored under the same file name
# in ../w2/startfiles/. Filtering and renaming happen in a single perl call; no -i switch is
# used because nothing is edited in place.
FILES=../w1/blocks/*
for file in $FILES; do
    # if nothing matches, the pattern stays unexpanded; do not create a file named "*"
    [ -e "$file" ] || continue
    NAME=$(basename "$file")
    MYPATH="../w2/startfiles/$NAME"
    perl -ne 'print if s/^R9(?=\s)/T9/' "$file" > "$MYPATH"
done

# assembling regexes:
# every file in ../w2/startfiles/ is loaded into its own Regexp::Assemble object and the
# stringified object is written to a file of the same name in ../w2/rxo/.
# Failures to open the directory or an output file now abort with a message instead of
# silently producing nothing.
perl -e '
    use strict; use warnings;
    use FindBin; use Regexp::Assemble;
    my $dir = "$FindBin::Bin/../w2/startfiles";
    opendir(my $dh, $dir) or die "cannot open directory $dir: $!";
    while (my $file = readdir($dh)) {
        next unless $file =~ /[^.]/;    # skips "." and ".."
        my $ra = Regexp::Assemble->new;
        $ra->add_file("$dir/$file");
        my $outfile = "$FindBin::Bin/../w2/rxo/$file";
        open(my $fh, ">", $outfile) or die "cannot write $outfile: $!";
        print {$fh} $ra;
        close($fh) or die "cannot close $outfile: $!";
    }
    closedir($dh);
'

# Separate the wave-2 data into the 23 author blocks.
# First an EX at the start of a line is replaced by EX_THE_END: the parser works with the
# $/ operator, which does not understand regexes, i.e. it needs a unique string to separate on.
perl -pi -e 's/^EX/EX_THE_END/g;' ../w2/welle2_bloecke
perl wave_parser.pl ../w2/welle2_bloecke w2


# Same for w3: for every file in ../w2/blocks/ the R9 lines are kept, their leading R9 tag
# is renamed to T9, and the result is stored under the same file name in ../w3/startfiles/.
# Filtering and renaming happen in a single perl call; no -i switch is used because nothing
# is edited in place.

FILES=../w2/blocks/*
for file in $FILES; do
    # if nothing matches, the pattern stays unexpanded; do not create a file named "*"
    [ -e "$file" ] || continue
    NAME=$(basename "$file")
    MYPATH="../w3/startfiles/$NAME"
    perl -ne 'print if s/^R9(?=\s)/T9/' "$file" > "$MYPATH"
done


# adding information on "being cited" per publication year

# For the complete WoS we collect the R9 tags per publication year to count how often a paper was cited.
# We further collect the T9s to be able to exclude duplicates.
# The script remembers the most recent FN, H8, PY and T9 values while reading and prints one
# line "R9,T9,H8,PY" for every R9 line. Fields not seen yet are printed as empty strings.
perl -e '
    use strict; use warnings;
    my ($fn, $h8, $t9, $r9, $py) = ("") x 5;
    while (<>) {
        $fn = $1 if /^FN\s+(.+)/;    # remembered, but not part of the output line
        $h8 = $1 if /^H8\s+(\d+)/;
        $py = $1 if /^PY\s+(\d+)/;
        $t9 = $1 if /^T9\s+(\d+)/;
        $r9 = $1 if /^R9\s+(\d+)/;
        print "$r9,$t9,$h8,$py\n" if /^R9/;
    }
' ../source_files/all_data_clean_tags > ../processed_files/wos_FNR9T9H8PY.csv

# we further collect SC to be able to see differences between scientific fields
# (done by citations_by_field.pl, which is not part of this file)
perl citations_by_field.pl ../source_files/all_data_clean_tags > ../processed_files/wos_R9T9H8PYSC.csv

# optimized regex for w1: restrict the wave-1 data to its T9 lines, strip the leading "T9 "
# tag so that only the value remains, then assemble all values into one regex.
# Only the tag at the start of the line is removed, and no -i switch is used because
# nothing is edited in place.
perl -ne 'if (/^T9/) { s/^T9 //; print }' ../w1/welle1_bloecke > ../w1/w1
perl -e 'use Regexp::Assemble; my $ra = Regexp::Assemble->new; $ra->add_file( "../w1/w1" ); print $ra->re;' > ../w1/w1.rxo

# Keep only those lines of wos_R9T9H8PYSC.csv that begin with one of the identifiers listed in the regex below.
# A "^" was manually added at the beginning of the regex so that it only matches at the start of a line.
# NOTE(review): the pattern is presumably a pasted copy of ../w1/w1.rxo and has to be refreshed by hand when that file changes - confirm.
perl -ne 'print if (/^(?^:0(?:0(?:0(?:0(?:9(?:27(?:6(?:52|87)|70[67]|846)|78(?:8(?:29|38)|911)|08067|47599)|0(?:7(?:2(?:857|915)|6538)|4(?:5628|7651|9153)|09443|64088)|1(?:4(?:7(?:38[89]|490)|3818)|81(?:4(?:33|77)|392|884))|4(?:7(?:5(?:(?:86|93)5|594)|36(?:06|21))|42659)|5(?:5(?:261[47]|5514)|22377|43142)|2(?:0(?:24(?:73|90)|9455)|19067)|697(?:7(?:82|95)|867)|8(?:05980|70507)|325012)|1(?:4(?:28(?:5(?:40|63)|46[018])|32403|41866|59017)|1(?:6(?:4913|5144)|7(?:292|836)0|98348)|7(?:699(?:[02]9|30)|08087|15262|98139)|5(?:0(?:41(?:47|96)|1946|5321)|67067)|6(?:(?:0816|6821)0|941(?:17|33))|9(?:641(?:08|10)|25688|41883)|0(?:5(?:0813|9291)|16368)|2(?:30767|78753)|3(?:83828|97890)|884248)|3(?:0(?:989(?:15|28)|40603)|3(?:3405[07]|63031)|55(?:6579|7537)|68368[35]|125565|405176|747744|821243|937190)|2(?:6(?:5(?:1965|2022)|37118|45577)|9(?:6(?:314|862)4|42577)|013503|421707|863874)|4(?:(?:14355|58854)8|2(?:16799|51282)|6(?:21009|97815)|040744|337203|731565)|7(?:(?:09899|39720)7|(?:41074|97720)5|281160|520458|845912)|5(?:16(?:0524|3596)|568089|794351|883275)|6(?:134154|538070|759515)|8(?:000623|731956)|9(?:568383|962057))|1(?:4(?:(?:31130|52313)0|28267[89]|427759)|7(?:(?:03324|21429)1|851254)|543(?:7713|8011)|61(?:754|822)07|0729968|1089452)|6(?:22(?:958(?:3[58]|41)|14306)|(?:143823|487445)6|0262468|6761050)|7(?:5(?:617201|812725)|8(?:046411|727679)|7981620)|9(?:(?:220217|677069)7|4(?:380165|900884))|5(?:37685(?:65|83)|6918385|8370438)|4(?:1670352|3497407)|2(?:511107|970795)8|3284370[68])|1(?:(?:1623298|4183183)6|00564108|31083371)))/)' ../processed_files/wos_R9T9H8PYSC.csv > ../processed_files/w1_sc.csv
